À l’aide des fonctions `sample`, `rnorm` et `runif`, on génère un échantillon de taille $n = 1000$ du mélange $\pi\,\mathcal{N}(\mu, \sigma^2) + (1-\pi)\,\mathcal{U}([-a, a])$, en considérant ici $\pi = 0{,}90$, $\mu = 2$, $\sigma = 1$ et $a = 10$.
# Simulate n points from the contamination mixture:
# with probability pi draw from N(mu, sig^2) (inliers),
# otherwise from Unif(-a, a) (the outlier component).
pi <- 0.90   # mixing weight of the Gaussian component (NB: shadows base constant pi)
mu <- 2      # Gaussian mean
sig <- 1     # Gaussian standard deviation
a <- 10      # half-width of the uniform support
c <- 1/(2*a) # density of the uniform component on [-a, a]
n <- 1000
# Vectorised generation: draw all n component labels at once, then all
# candidate values; ifelse() selects the right component for each point.
# (Replaces the element-by-element for-loop — same distribution, idiomatic R.)
y <- sample(c(1, 0), size = n, replace = TRUE, prob = c(pi, 1 - pi))
x <- ifelse(y == 1, rnorm(n, mean = mu, sd = sig), runif(n, min = -a, max = a))
boxplot(x)
dotchart(x)
L’objectif de cette question est de coder un algorithme EM pour ce problème. Pour cela, on pourra s’aider du document “EM calculs” sur Moodle ainsi que des slides 27–32 du cours.
# Observed-data log-likelihood of the contamination mixture
# pi*N(mu, sig^2) + (1-pi)*Unif(-a, a).
#
# theta : parameter vector c(mu, sig, pi)
# x     : observed sample
# c     : density of the uniform component on [-a, a]. Previously this was
#         read from a free global variable; it is now an explicit parameter
#         with a backward-compatible default (existing calls loglik(theta, x)
#         behave exactly as before).
loglik <- function(theta, x, c = 1/(2*a)){
  phi <- dnorm(x, mean = theta[1], sd = theta[2])
  logL <- sum(log(theta[3]*phi + (1 - theta[3])*c))
  return(logL)
}
# EM algorithm for the contamination mixture pi*N(mu, sig^2) + (1-pi)*Unif(-a, a).
#
# x      : observed sample
# theta0 : initial parameter vector c(mu0, sig0, pi0)
# a      : half-width of the uniform support
# epsi   : stopping threshold on the log-likelihood increase
#
# Returns a list with the final log-likelihood, the estimated theta,
# and y, the posterior probability that each point is an inlier.
#
# Fix: the original called the global loglik(), which read the uniform
# density from the *global* variable c rather than the local c computed
# from the `a` argument — inconsistent whenever a != 10. The log-likelihood
# is now computed locally with the same c used in the E-step.
em_outlier <- function(x, theta0, a, epsi){
  c <- 1/(2*a)       # uniform density on [-a, a]
  n <- length(x)
  # Observed-data log-likelihood, sharing the same c as the E-step
  obs_loglik <- function(theta){
    sum(log(theta[3]*dnorm(x, mean = theta[1], sd = theta[2]) + (1 - theta[3])*c))
  }
  go_on <- TRUE
  t <- 0
  logL0 <- obs_loglik(theta0)
  print(c(t, logL0))
  while(go_on){
    t <- t + 1
    # E-step: posterior probability that each observation comes from the Gaussian
    phi <- dnorm(x, mean = theta0[1], sd = theta0[2])
    y <- phi*theta0[3]/(phi*theta0[3] + c*(1 - theta0[3]))
    # M-step: weighted maximum-likelihood updates
    S <- sum(y)
    pi <- S/n
    mu <- sum(x*y)/S
    sig <- sqrt(sum(y*(x - mu)^2)/S)
    theta <- c(mu, sig, pi)
    logL <- obs_loglik(theta)
    # Stop once the log-likelihood gain falls below epsi
    if (logL - logL0 < epsi){
      go_on <- FALSE
    }
    logL0 <- logL
    theta0 <- theta
    print(c(t, logL))
  }
  return(list(loglik = logL, theta = theta, y = y))
}
On va maintenant appliquer cet algorithme sur les données avec différentes initialisations.
# Run the EM algorithm, initialised at the empirical moments of the sample
# and an uninformative 50/50 prior weight.
epsi <- 1e-8     # stop once the log-likelihood gain drops below this
pi0 <- 0.5       # neutral initial weight for the Gaussian component
mu0 <- mean(x)
sig0 <- sd(x)
theta0 <- c(mu0, sig0, pi0)
estim <- em_outlier(x, theta0, a, epsi)
## [1] 0.000 -2250.246
## [1] 1.000 -1789.902
## [1] 2.000 -1721.572
## [1] 3.000 -1714.053
## [1] 4.000 -1712.578
## [1] 5.000 -1712.265
## [1] 6.000 -1712.201
## [1] 7.000 -1712.188
## [1] 8.000 -1712.186
## [1] 9.000 -1712.185
## [1] 10.000 -1712.185
## [1] 11.000 -1712.185
## [1] 12.000 -1712.185
## [1] 13.000 -1712.185
## [1] 14.000 -1712.185
## [1] 15.000 -1712.185
## [1] 16.000 -1712.185
# Posterior probability of being an outlier, for each observation
plot(x, 1 - estim$y)
# Cross-check: maximise the observed-data log-likelihood directly
# (fnscale = -1 turns optim's minimisation into maximisation)
opt <- optim(
  theta0, loglik,
  method = "L-BFGS-B",
  control = list(fnscale = -1),
  lower = c(-10, 0, 0.01),
  upper = c(10, 10, .99),
  x = x
)
print(opt$par) # estimates computed with optim
## [1] 1.9781772 0.9872289 0.9187138
print(estim$theta) # estimates computed with EM
## [1] 1.9781778 0.9872292 0.9187172
library(mclust)
## Package 'mclust' version 6.0.0
## Type 'citation("mclust")' for citing this R package in publications.
# Wine data: first column is the true cultivar label, columns 2-14 are
# the chemical measurements used for clustering (n = 178, per output below).
data<-read.table('wine.txt',header=FALSE)
x<-data[,2:14]
# Pairs plots of the variables, coloured by the true class
plot(x[,1:7],col=data[,1],pch=data[,1])
plot(x[,8:13],col=data[,1],pch=data[,1])
# Fit Gaussian mixture models; Mclust selects the covariance structure
# and the number of components by BIC
wine <- Mclust(x)
summary(wine)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVE (ellipsoidal, equal orientation) model with 3 components:
##
## log-likelihood n df BIC ICL
## -3015.333 178 158 -6849.387 -6850.694
##
## Clustering table:
## 1 2 3
## 59 69 50
# BIC curves for all candidate models, then the fitted classification
# and its uncertainty on the first seven variables
plot(wine,what="BIC")
plot(wine,what="classification",dimens=1:7)
plot(wine,what="uncertainty",dimens=1:7)
# Confusion table: true labels vs. estimated clusters
table(data[,1],wine$classification)
##
## 1 2 3
## 1 59 0 0
## 2 0 69 2
## 3 0 0 48
# Agreement with the true partition (1 = perfect match)
adjustedRandIndex(data[,1],wine$classification)
## [1] 0.9667411
# E. coli data; drop columns 1, 4, 5, 8 and 9 to keep the numeric
# features used for clustering (column 9 holds the class label;
# presumably the others are identifiers/near-constant — TODO confirm).
data<-read.table('ecoli.txt',header=FALSE)
x<-data[,-c(1,4,5,8,9)]
# Real classes
class.new = as.integer(as.factor(data[,9]))
plot(x,col=class.new)
# Model-based clustering with BIC model selection
ecoli <- Mclust(x)
summary(ecoli)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EVI (diagonal, equal volume, varying shape) model with 3 components:
##
## log-likelihood n df BIC ICL
## 811.4761 336 24 1483.341 1464.26
##
## Clustering table:
## 1 2 3
## 145 78 113
# Confusion table: the 8 true classes are mapped onto only 3 clusters
table(class.new,ecoli$classification)
##
## class.new 1 2 3
## 1 139 4 0
## 2 2 4 71
## 3 0 1 1
## 4 0 0 2
## 5 0 1 34
## 6 0 20 0
## 7 0 4 1
## 8 4 44 4
plot(ecoli,what="classification")
# Moderate agreement with the true partition
adjustedRandIndex(class.new,ecoli$classification)
## [1] 0.6968981
# Seeds data: columns 1-7 are the geometric measurements used for
# clustering (column 8 holds the true variety label, used later).
data<-read.table('seeds.txt',header=FALSE)
x<-data[,1:7]
# Model-based clustering on the raw variables: BIC selects 4 components
seeds <- Mclust(x)
summary(seeds)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEV (ellipsoidal, equal volume and shape) model with 4 components:
##
## log-likelihood n df BIC ICL
## 1323.903 210 122 1995.46 1991.099
##
## Clustering table:
## 1 2 3 4
## 59 56 53 42
plot(seeds)
Some input variables are strongly correlated; it may be preferable to extract principal components first.
# PCA on the correlation matrix (cor=TRUE standardises the variables),
# keeping the first 5 principal components to decorrelate the inputs
pca<-princomp(x,cor=TRUE)
z<-pca$scores[,1:5]
# Refit the mixture on the PCA scores: BIC now selects 3 components,
# matching the number of true varieties
seeds1 <- Mclust(z)
summary(seeds1)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EVE (ellipsoidal, equal volume and orientation) model with 3 components:
##
## log-likelihood n df BIC ICL
## -797.2031 210 40 -1808.29 -1819.132
##
## Clustering table:
## 1 2 3
## 61 73 76
plot(seeds1)
# Agreement with the true variety labels (column 8)
adjustedRandIndex(as.numeric(data[,8]),seeds1$classification)
## [1] 0.8769483